transform

old TransE-like models
git clone https://esimon.eu/repos/transform.git
Log | Files | Refs | README

build Bordes FB15k.py (3179B)


      1 #!/usr/bin/env python2
      2 
      3 from __future__ import print_function
      4 import sys
      5 import os
      6 from log import log
      7 
      8 urls = [ 'https://www.hds.utc.fr/everest/lib/exe/fetch.php?id=en%3Atranse&cache=cache&media=en:fb15k.tgz' ]
      9 
     10 def get_archive(path):
     11     import urllib
     12 
     13     class URLopener(urllib.FancyURLopener):
     14           def http_error_default(self, url, fp, errcode, errmsg, headers):
     15               print('Error: {0} {1}'.format(errcode, errmsg), file=sys.stderr)
     16               raise IOError
     17 
     18     archive = path+'/archive.tgz'
     19     downloaded = False
     20     for url in urls:
     21         log('Downloading dataset from "{0}"...'.format(url))
     22         try:
     23             URLopener().retrieve(url, archive)
     24             downloaded = True
     25             log(' done\n')
     26         except IOError:
     27             pass
     28 
     29     if not downloaded:
     30         print('Error: Unable to download dataset.', file=sys.stderr)
     31         sys.exit(1)
     32 
     33 def get_raw(path):
     34     if os.path.isdir(path+'/raw'):
     35         return
     36 
     37     get_archive(path)
     38 
     39     log('Raw files not found, extracting archive...')
     40     raw = path+'/raw'
     41     os.mkdir(raw)
     42 
     43     import tarfile
     44     tar = tarfile.open(path+'/archive.tgz', 'r:gz')
     45     tar.extractall(raw)
     46     log(' done\n')
     47 
     48 def compile_dataset(path):
     49     get_raw(path)
     50     prefix = path+'/raw/FB15k/freebase_mtr100_mte100-'
     51     suffix = '.txt'
     52 
     53     log('Reading train file...')
     54     with open(prefix+'train'+suffix, 'r') as file:
     55         content = map(lambda line: line.rstrip('\n').split('\t'), file.readlines())
     56         [left, relations, right] = map(set, zip(*content))
     57     entities = left | right
     58     log(' done\n')
     59 
     60     log('Writting entities...')
     61     e2i, i2e, r2i, i2r = {}, {}, {}, {}
     62     with open(path+'/entities', 'w') as file:
     63         i=0
     64         for entity in entities:
     65             e2i[entity]=i
     66             i2e[i]=entity
     67             file.write(entity+'\n')
     68             i+=1
     69     log(' done ({0} entities written)\n'.format(i))
     70 
     71     log('Writting relations...')
     72     with open(path+'/relations', 'w') as file:
     73         i=0
     74         for relation in relations:
     75             r2i[relation]=i
     76             i2r[i]=relation
     77             file.write(relation+'\n')
     78             i+=1
     79     log(' done ({0} relations written)\n'.format(i))
     80 
     81     for name in ['train', 'valid', 'test']:
     82         log('Compiling {0}...'.format(name))
     83         count = 0
     84         with open(prefix+name+suffix, 'r') as infile:
     85             with open(path+'/'+name, 'w') as outfile:
     86                 for line in infile.readlines():
     87                     left, relation, right = line.rstrip('\n').split('\t')
     88                     if left in e2i and right in e2i and relation in r2i:
     89                         outfile.write('{0}\t{1}\t{2}\n'.format(e2i[left], r2i[relation], e2i[right]))
     90                     else:
     91                         count+=1
     92         log(' done ({0} entit{1} removed)\n'.format(count, 'y' if count<2 else 'ies'))
     93 
     94 if __name__ == '__main__':
     95     if len(sys.argv)<2:
     96         print('Usage: {0} path'.format(sys.argv[0]), file=sys.stderr)
     97         sys.exit(1)
     98 
     99     path = sys.argv[1]
    100     if not os.path.isdir(path):
    101         os.mkdir(path)
    102 
    103     compile_dataset(path)
    104     log('Bordes FB15k was successfully built in {0}\n'.format(path))